# Kernel: hgemm_nn_32x64

# ******************************************************************************
# Copyright 2014-2018 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ******************************************************************************


// Assembler constants and kernel-parameter locations.
//   szShareA / szShareB : element counts of the A and B shared-memory tiles
//                         (scaled to bytes with 4x<...> where they are used).
//   addr_zero           : 4 * 2 * (szShareA + szShareB), i.e. the first byte past
//                         both A+B buffer pairs; the setup code stores zeros here
//                         and loads them back to clear registers.
//   gridDim* / param_*  : constant-bank locations (c[0x0][...]) read by the kernel.
<CONSTANT_MAPPING>
    addr_zero  : 4x<32*33*2 + 64*32*2>
    szShareA   : (32*33)
    szShareB   : (64*32)

    gridDimA : c[0x0][0x14]
    gridDimB : c[0x0][0x18]

    param_C[0]      : c[0x0][0x140]
    param_C[1]      : c[0x0][0x144]
    param_A[0]      : c[0x0][0x148]
    param_A[1]      : c[0x0][0x14c]
    param_B[0]      : c[0x0][0x150]
    param_B[1]      : c[0x0][0x154]
    param_alpha     : c[0x0][0x158]
    param_beta      : c[0x0][0x15c]
    param_flags     : c[0x0][0x160]
    param_lda       : c[0x0][0x164]
    param_ldb       : c[0x0][0x168]
    param_ldc       : c[0x0][0x16c]
    param_m         : c[0x0][0x170]
    param_n         : c[0x0][0x174]
    param_k         : c[0x0][0x178]
    param_ldaz      : c[0x0][0x17c]
    param_ldbz      : c[0x0][0x180]
    param_ldcz      : c[0x0][0x184]
    param_loops     : c[0x0][0x188]
</CONSTANT_MAPPING>

// Register allocation. Several names share the same physical range because
// they are live in different phases of the kernel:
//   0-63    cx<x>y<y> : the 64 fp32 accumulators updated by the FFMA loop;
//           czero<..> : aliases of the same registers, used by the setup code
//                       to clear them four at a time with LDS.U.128.
//   64-95   j0* / j1* : two sets of A/B operands that the loop alternates
//                       between; the setup temporaries (lda, tidAX, ...) and the
//                       shuffle_* registers of the output stage reuse this range.
//   96-129  load* / track* : staging registers for global loads and the 64-bit
//                       global addresses they are loaded from.
//   The second group (part*, shuffle_*, loadC, b, c, C, ldc, ...) is referenced
//   only by the code after the main loop.
// NOTE(review): ':' lists registers in the order given, '~' is presumably the
// assembler's automatic assignment within the range — confirm against the
// maxas documentation.
<REGISTER_MAPPING>

     3, 2,11,10,19,18,27,26 : cx<0-7>y0
     7, 6,15,14,23,22,31,30 : cx<0-7>y1
     1, 0, 9, 8,17,16,25,24 : cx<0-7>y2
     5, 4,13,12,21,20,29,28 : cx<0-7>y3
    35,34,43,42,51,50,59,58 : cx<0-7>y4
    39,38,47,46,55,54,63,62 : cx<0-7>y5
    33,32,41,40,49,48,57,56 : cx<0-7>y6
    37,36,45,44,53,52,61,60 : cx<0-7>y7

       0-63 : czero<00-63>
      64-79 : j0Ay<0-7>, j0Bx<0-7>
      80-95 : j1Ay<0-7>, j1Bx<0-7>

      64-95 ~ lda, ldb, ldb8, tidAX, tidAY, tidBX, tidBY, tidAY<1-3>, tidBY<8|16|24>, tid1, tid32, tb, shiftAX, partialK, partialB, ldaz, ldbz, ta, txa, txb, txb<1-3>, xmad_ta, xmad_tb

     96-119 :  load0A<0-7>,  load0B<0-3>,  load1B<0-3>,  load2B<0-3>,  load3B<0-3>
    120-129 : track0A<0-1>, track0B<0-1>, track1B<0-1>, track2B<0-1>, track3B<0-1>

    130-137 ~ swapBuf, readAs, readBs, writeAs, writeBs, k, ldb32
    138-144 ~ tid, blkA, blkB, blkZ, writeCs, preds

       0-15 : part0C<0-3>, part1C<0-3>, part2C<0-3>, part3C<0-3>
      64-95 : shuffle_x<0-7>y0, shuffle_x<0-7>y1, shuffle_x<0-7>y2, shuffle_x<0-7>y3
      64-95 : shuffle_x<0-7>y4, shuffle_x<0-7>y5, shuffle_x<0-7>y6, shuffle_x<0-7>y7
      96-99 : loadC<0-3>
    100-103 : b<0-3>
    104-107 : c<0-3>
    108-109 : C<0-1>
    110-137 ~ ldc, ldcz, cx, cx<1-3>, cy, ci, xmad_c, ldc8, readCs, alpha, beta, flags, tid15, tid16

</REGISTER_MAPPING>

// Read the thread index and the three block indices
// (CTAID.Y -> blkA, CTAID.Z -> blkB, CTAID.X -> blkZ).
// Each S2R sets its own barrier (1-4); the first consumer of each value
// below carries the matching wait mask (01, 02, 04, 08).
--:-:1:-:1      S2R tid,  SR_TID.X;
--:-:2:-:1      S2R blkA, SR_CTAID.Y;
--:-:3:-:1      S2R blkB, SR_CTAID.Z;
--:-:4:-:1      S2R blkZ, SR_CTAID.X;

<SCHEDULE_BLOCK>
// Copy the kernel parameters needed during setup into registers.
--:-:-:-:1      MOV k,    param_k;
--:-:-:-:1      MOV lda,  param_lda;
--:-:-:-:1      MOV ldb,  param_ldb;
--:-:-:-:1      MOV ldaz, param_ldaz;
--:-:-:-:1      MOV ldbz, param_ldbz;
// ldb8  = ldb * 8  : element offset added between the four B tracks
// ldb32 = ldb * 64 : added to the B track byte addresses once per loop pass
//                    (32 rows of 2-byte elements)
--:-:-:-:1      SHL ldb8,  ldb, 3;
--:-:-:-:1      SHL ldb32, ldb, 6;

// Store zeros at addr_zero, then load them back four registers at a time
// to clear all 64 accumulators (czero00, czero04, ... czero60).
--:-:-:-:1      STS.128 [addr_zero], RZ;
[+ join '', map sprintf("--:-:-:-:1      LDS.U.128 czero%02d, [addr_zero];\n", $_ * 4), 0..15; +]

// Thread coordinates used to fetch A:
// tidAX   = tid >> 2
// tidAY   = (tid & 3) << 3
// shiftAX = (tid & 3) << 3
01:-:-:-:1      SHR.U32 tidAX,   tid,   2;
--:-:-:-:1      LOP.AND tidAY,   tid,   3;
--:-:-:-:1      SHL     shiftAX, tidAY, 3;
--:-:-:-:1      SHL     tidAY,   tidAY, 3;

// Thread coordinates used to fetch B:
// tidBX   = (tid & 15) << 2
// tidBY   = tid >> 4
01:-:-:-:1      LOP.AND tidBX, tid,  15;
--:-:-:-:1      SHL     tidBX, tidBX, 2;
--:-:-:-:1      SHR.U32 tidBY, tid,   4;

// Row indices of the three additional B tracks, 8 rows apart.
--:-:-:-:1      IADD tidBY8,  tidBY, 8;
--:-:-:-:1      IADD tidBY16, tidBY, 16;
--:-:-:-:1      IADD tidBY24, tidBY, 24;

// trackA += ((blkA*32 + tidAX) * lda + tidAY) * 2
// (plus the batch offset ldaz * blkZ, added by the XMAD.LO2 below)
02:-:-:-:1      ISCADD   txa, blkA, tidAX, 5;
--:-:-:-:1      XMAD.LO  ta, lda,  txa, tidAY, xmad_ta;
08:-:-:-:1      XMAD.LO2 ta, ldaz, blkZ, ta;
--:-:-:-:1      LEA      track0A0.CC, ta, param_A[0],     1;
--:-:-:-:1      LEA.HI.X track0A1,    ta, param_A[1], RZ, 1;

// P2 = txa < m
--:-:-:-:1      ISETP.LT.AND P2, PT, txa, param_m, PT;

// trackB += (blkB*64 + tidBX + ldb*tidBY) * 2
// (plus the batch offset ldbz * blkZ); tracks 1-3 are each ldb8 elements
// further than the previous one.
04:-:-:-:1      ISCADD   txb, blkB, tidBX, 6;
--:-:-:-:1      XMAD.LO2 tb, ldb,  tidBY, txb;
08:-:-:-:1      XMAD.LO2 tb, ldbz, blkZ,  tb;
--:-:-:-:1      LEA      track0B0.CC, tb, param_B[0],     1;
--:-:-:-:1      LEA.HI.X track0B1,    tb, param_B[1], RZ, 1;
--:-:-:-:1      IADD     tb,  tb,  ldb8;
--:-:-:-:1      LEA      track1B0.CC, tb, param_B[0],     1;
--:-:-:-:1      LEA.HI.X track1B1,    tb, param_B[1], RZ, 1;
--:-:-:-:1      IADD     tb,  tb,  ldb8;
--:-:-:-:1      LEA      track2B0.CC, tb, param_B[0],     1;
--:-:-:-:1      LEA.HI.X track2B1,    tb, param_B[1], RZ, 1;
--:-:-:-:1      IADD     tb,  tb,  ldb8;
--:-:-:-:1      LEA      track3B0.CC, tb, param_B[0],     1;
--:-:-:-:1      LEA.HI.X track3B1,    tb, param_B[1], RZ, 1;

// P3 = txb < n
// When $vec is false the generator below also emits P4-P6 = (txb + 1..3) < n,
// one predicate per individually loaded B element.
--:-:-:-:1      ISETP.LT.AND P3, PT, txb, param_n, PT;
[+
    our $vec;
    return $vec ? '' : q{
--:-:-:-:1      IADD txb1, txb, 1;
--:-:-:-:1      IADD txb2, txb, 2;
--:-:-:-:1      IADD txb3, txb, 3;
--:-:-:-:1      ISETP.LT.AND P4, PT, txb1, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P5, PT, txb2, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P6, PT, txb3, param_n, PT;
    };
+]
// Save P2-P6 (mask 0x7c) in preds so they can be restored with R2P later.
--:-:-:-:1      P2R preds, PR, RZ, 0x7c;

// writeAs = (tidAY*32 + tidAX + shiftAX) * 4
--:-:-:-:1      ISCADD writeAs, tidAY, tidAX, 5;
--:-:-:-:1      IADD   writeAs, writeAs, shiftAX;
--:-:-:-:1      SHL    writeAs, writeAs, 2;

// writeBs = (tidBY*64 + tidBX) * 4 + 4x<szShareA>
--:-:-:-:1      ISCADD writeBs, tidBY, tidBX, 6;
--:-:-:-:1      ISCADD writeBs, writeBs, 4x<szShareA>, 2;

// readAs = (((tid & 16) >> 3) | (tid & 1)) << 4
--:-:-:-:1      LOP.AND tid1,   tid,    1;
--:-:-:-:1      LOP.AND readAs, tid,   16;
--:-:-:-:1      SHR.U32 readAs, readAs, 3;
--:-:-:-:1      LOP.OR  readAs, readAs, tid1;
--:-:-:-:1      SHL     readAs, readAs, 4;

// readBs  = ((tid >> 1) & 7) << 4
--:-:-:-:1      BFE.U32 readBs, tid,    0x301; // 3 bits at position 1
--:-:-:-:1      SHL     readBs, readBs, 4;

// Each tile has 32 threads so this is an index into the 4 tiles (at bit position 5)
// tid32 = tid & -32
--:-:-:-:1      LOP.AND tid32, tid, -32;

// Write out the 4 groups of 32 rows 16 at a time
// writeCs = (readAs + tid32/2*4) * 64 + readBs
--:-:-:-:1      ISCADD writeCs, tid32,   readAs, 1;
--:-:-:-:1      ISCADD writeCs, writeCs, readBs, 6;

// Each block of 32 threads works on 8 lines,
// readAs is also shifted over by 8 for each group of 32 threads
// readAs += tid32/4 * 32 * 4 + tid32/4 * 4
// readBs += tid32/4 * 64 * 4 + 4x<szShareA>
--:-:-:-:1      ISCADD readAs, tid32,  readAs, 5;
--:-:-:-:1      ISCADD readBs, tid32,  readBs, 6;
--:-:-:-:1      IADD   readAs, tid32,  readAs;
--:-:-:-:1      IADD   readBs, readBs, 4x<szShareA>;

// swapBuf = byte size of one A+B buffer; it is added to the shared addresses
// to switch buffers and negated after every switch.
--:-:-:-:1      MOV32I swapBuf, 4x<szShareA + szShareB>;

// If k is not a multiple of 32 we want to grab the partial amount on the first fetch.
// If it is a multiple of 32 then make a full 32 line fetch.
// partialK = k & 31, replaced by 32 when that is zero; k is reduced by partialK.
--:-:-:-:1      LOP.AND.Z P0, partialK, k, 31;
--:-:-:-:1  @P0 MOV partialK, 32;
--:-:-:-:1      IADD k, k, -partialK;
// First global fetch, limited to the first partialK lines.
//   $vec true : one 128-bit load for A and one 64-bit load per B track; a load
//               whose predicate is false is replaced by zeros read from addr_zero.
//   $vec false: one U16 load per element, each with its own predicate; elements
//               that are not loaded are set to RZ. P3-P6 are reused as scratch,
//               and the B column predicates are restored from preds (mask 0x78)
//               for every track whose row index is below partialK, or cleared
//               otherwise.
[+
    our $vec;
    return $vec ? q{
--:-:-:-:1      ISETP.LT.AND P2, PT, tidAY,   partialK, P2;
--:-:-:-:1      ISETP.LT.AND P4, PT, tidBY8,  partialK, P3;
--:-:-:-:1      ISETP.LT.AND P5, PT, tidBY16, partialK, P3;
--:-:-:-:1      ISETP.LT.AND P6, PT, tidBY24, partialK, P3;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidBY,   partialK, P3;
<ORDERED>
--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
--:-:4:-:1  @P4 LDG.E.CI.64  load1B, [track1B];
--:-:5:-:1  @P5 LDG.E.CI.64  load2B, [track2B];
--:-:6:-:1  @P6 LDG.E.CI.64  load3B, [track3B];
</ORDERED>
<ORDERED>
--:-:-:-:1 @!P2 LDS.U.128 load0A, [addr_zero];
--:-:-:-:1 @!P3 LDS.U.64  load0B, [addr_zero];
--:-:-:-:1 @!P4 LDS.U.64  load1B, [addr_zero];
--:-:-:-:1 @!P5 LDS.U.64  load2B, [addr_zero];
--:-:1:-:1 @!P6 LDS.U.64  load3B, [addr_zero];
</ORDERED>
    } : q{

--:-:-:-:1      IADD tidAY1, tidAY, 1;
--:-:-:-:1      IADD tidAY2, tidAY, 2;
--:-:-:-:1      IADD tidAY3, tidAY, 3;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
--:-:2:-:1  @P6 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load0A0, RZ;
--:-:-:-:1 @!P4 MOV load0A1, RZ;
--:-:-:-:1 @!P5 MOV load0A2, RZ;
--:-:-:-:1 @!P6 MOV load0A3, RZ;

--:-:-:-:1      IADD tidAY,  tidAY,  4;
--:-:-:-:1      IADD tidAY1, tidAY1, 4;
--:-:-:-:1      IADD tidAY2, tidAY2, 4;
--:-:-:-:1      IADD tidAY3, tidAY3, 4;
--:-:-:-:1      ISETP.LT.AND P3, PT, tidAY,  partialK, P2;
--:-:-:-:1      ISETP.LT.AND P4, PT, tidAY1, partialK, P2;
--:-:-:-:1      ISETP.LT.AND P5, PT, tidAY2, partialK, P2;
--:-:-:-:1      ISETP.LT.AND P6, PT, tidAY3, partialK, P2;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
--:-:2:-:1  @P6 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load0A4, RZ;
--:-:-:-:1 @!P4 MOV load0A5, RZ;
--:-:-:-:1 @!P5 MOV load0A6, RZ;
--:-:-:-:1 @!P6 MOV load0A7, RZ;


--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY, partialK, PT;
--:-:-:-:1  @P0 R2P PR, preds, 0x78;
--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load0B0, RZ;
--:-:-:-:1 @!P4 MOV load0B1, RZ;
--:-:-:-:1 @!P5 MOV load0B2, RZ;
--:-:-:-:1 @!P6 MOV load0B3, RZ;

--:-:-:-:1      ISETP.LT.AND P1, PT, tidBY8, partialK, PT;
--:-:-:-:1  @P1 R2P PR, preds, 0x78;
--:-:-:-:1 @!P1 R2P PR, RZ,    0x78;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load1B0, RZ;
--:-:-:-:1 @!P4 MOV load1B1, RZ;
--:-:-:-:1 @!P5 MOV load1B2, RZ;
--:-:-:-:1 @!P6 MOV load1B3, RZ;

--:-:-:-:1      ISETP.LT.AND P2, PT, tidBY16, partialK, PT;
--:-:-:-:1  @P2 R2P PR, preds, 0x78;
--:-:-:-:1 @!P2 R2P PR, RZ,    0x78;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load2B0, RZ;
--:-:-:-:1 @!P4 MOV load2B1, RZ;
--:-:-:-:1 @!P5 MOV load2B2, RZ;
--:-:-:-:1 @!P6 MOV load2B3, RZ;

--:-:-:-:1      ISETP.LT.AND P0, PT, tidBY24, partialK, PT;
--:-:-:-:1  @P0 R2P PR, preds, 0x78;
--:-:-:-:1 @!P0 R2P PR, RZ,    0x78;
<ORDERED>
--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
</ORDERED>
--:-:-:-:1 @!P3 MOV load3B0, RZ;
--:-:-:-:1 @!P4 MOV load3B1, RZ;
--:-:-:-:1 @!P5 MOV load3B2, RZ;
--:-:-:-:1 @!P6 MOV load3B3, RZ;

    };
+]
// partialB = partialK * ldb
--:-:-:-:1      XMAD.LO2 partialB, ldb, partialK, RZ;

// P1 = at least 32 more lines remain after the first fetch. k is decremented
// for that block, and P2-P6 are restored from preds when P1 is set or
// cleared when it is not, which gates the next global fetch.
--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT;
--:-:-:-:1      IADD k, k, -32;
--:-:-:-:1  @P1 R2P PR, preds, 0x7c;
--:-:-:-:1 @!P1 R2P PR, RZ, 0x7c;
</SCHEDULE_BLOCK>

// Convert the first A fetch from fp16 to fp32 in place.
// With $vec the eight halves are packed two per register (load0A0-3), so the
// conversions run from the highest destination down and every packed source
// is read before it is overwritten. Barrier 1 covers load0A4-7 and barrier 2
// covers load0A0-3; the stores below wait on them in that order.
[+
    our $vec;
    return $vec ? q{
03:-:-:-:1      F2F.F32.F16 load0A7, load0A3.H1;
--:-:-:-:1      F2F.F32.F16 load0A6, load0A3.H0;
--:-:-:-:1      F2F.F32.F16 load0A5, load0A2.H1;
--:-:1:-:1      F2F.F32.F16 load0A4, load0A2.H0;
--:-:-:-:1      F2F.F32.F16 load0A3, load0A1.H1;
--:-:-:-:1      F2F.F32.F16 load0A2, load0A1.H0;
--:-:-:-:1      F2F.F32.F16 load0A1, load0A0.H1;
--:-:2:-:1      F2F.F32.F16 load0A0, load0A0.H0;
    } : q{
02:-:-:-:1      F2F.F32.F16 load0A7, load0A7;
--:-:-:-:1      F2F.F32.F16 load0A6, load0A6;
--:-:-:-:1      F2F.F32.F16 load0A5, load0A5;
--:-:1:-:1      F2F.F32.F16 load0A4, load0A4;
--:-:-:-:1      F2F.F32.F16 load0A3, load0A3;
--:-:-:-:1      F2F.F32.F16 load0A2, load0A2;
--:-:-:-:1      F2F.F32.F16 load0A1, load0A1;
--:-:2:-:1      F2F.F32.F16 load0A0, load0A0;
    };
+]
// Advance the A track past the partial block: track0A += partialK * 2 bytes.
// The LEA produces the carry that the IADD.X after the stores consumes.
--:-:-:-:0      LEA track0A0.CC, partialK, track0A0, 1;
// Store the eight converted A values, one shared-memory line (32 floats) apart.
01:-:-:-:1      STS [writeAs + 4x<7*32>], load0A7;
--:-:-:-:1      STS [writeAs + 4x<6*32>], load0A6;
--:-:-:-:1      STS [writeAs + 4x<5*32>], load0A5;
--:-:-:-:1      STS [writeAs + 4x<4*32>], load0A4;
02:-:-:-:1      STS [writeAs + 4x<3*32>], load0A3;
--:-:-:-:1      STS [writeAs + 4x<2*32>], load0A2;
--:-:-:-:1      STS [writeAs + 4x<1*32>], load0A1;
--:-:-:-:1      STS [writeAs + 4x<0*32>], load0A0;
--:-:-:-:0      IADD.X track0A1,    track0A1, RZ;

// Convert the four B fetches from fp16 to fp32; each group waits on the
// barrier of its own global load (3-6) and sets the same barrier again when
// its last conversion is issued.
[+
    our $vec;
    return $vec ? q{
04:-:-:-:1      F2F.F32.F16 load0B3, load0B1.H1;
--:-:-:-:1      F2F.F32.F16 load0B2, load0B1.H0;
--:-:-:-:1      F2F.F32.F16 load0B1, load0B0.H1;
--:-:3:-:1      F2F.F32.F16 load0B0, load0B0.H0;

08:-:-:-:1      F2F.F32.F16 load1B3, load1B1.H1;
--:-:-:-:1      F2F.F32.F16 load1B2, load1B1.H0;
--:-:-:-:1      F2F.F32.F16 load1B1, load1B0.H1;
--:-:4:-:1      F2F.F32.F16 load1B0, load1B0.H0;

10:-:-:-:1      F2F.F32.F16 load2B3, load2B1.H1;
--:-:-:-:1      F2F.F32.F16 load2B2, load2B1.H0;
--:-:-:-:1      F2F.F32.F16 load2B1, load2B0.H1;
--:-:5:-:1      F2F.F32.F16 load2B0, load2B0.H0;

20:-:-:-:1      F2F.F32.F16 load3B3, load3B1.H1;
--:-:-:-:1      F2F.F32.F16 load3B2, load3B1.H0;
--:-:-:-:1      F2F.F32.F16 load3B1, load3B0.H1;
--:-:6:-:1      F2F.F32.F16 load3B0, load3B0.H0;
    } : q{
04:-:-:-:1      F2F.F32.F16 load0B0, load0B0;
--:-:-:-:1      F2F.F32.F16 load0B1, load0B1;
--:-:-:-:1      F2F.F32.F16 load0B2, load0B2;
--:-:3:-:1      F2F.F32.F16 load0B3, load0B3;

08:-:-:-:1      F2F.F32.F16 load1B0, load1B0;
--:-:-:-:1      F2F.F32.F16 load1B1, load1B1;
--:-:-:-:1      F2F.F32.F16 load1B2, load1B2;
--:-:4:-:1      F2F.F32.F16 load1B3, load1B3;

10:-:-:-:1      F2F.F32.F16 load2B0, load2B0;
--:-:-:-:1      F2F.F32.F16 load2B1, load2B1;
--:-:-:-:1      F2F.F32.F16 load2B2, load2B2;
--:-:5:-:1      F2F.F32.F16 load2B3, load2B3;

20:-:-:-:1      F2F.F32.F16 load3B0, load3B0;
--:-:-:-:1      F2F.F32.F16 load3B1, load3B1;
--:-:-:-:1      F2F.F32.F16 load3B2, load3B2;
--:-:6:-:1      F2F.F32.F16 load3B3, load3B3;
    };
+]

// For each B track: advance the address by partialB * 2 bytes (partialK rows)
// and store the four converted values to its shared-memory rows (0, 8, 16, 24).
--:-:-:-:0      LEA track0B0.CC, partialB, track0B0, 1;
04:-:-:-:6      STS.128 [writeBs + 4x<0*64>], load0B;
--:-:-:-:1      IADD.X track0B1, track0B1, RZ;

--:-:-:-:0      LEA track1B0.CC, partialB, track1B0, 1;
08:-:-:-:6      STS.128 [writeBs + 4x<8*64>], load1B;
--:-:-:-:1      IADD.X track1B1, track1B1, RZ;

--:-:-:-:0      LEA track2B0.CC, partialB, track2B0, 1;
10:-:-:-:6      STS.128 [writeBs + 4x<16*64>], load2B;
--:-:-:-:1      IADD.X track2B1, track2B1, RZ;

--:-:-:-:0      LEA track3B0.CC, partialB, track3B0, 1;
20:-:-:-:6      STS.128 [writeBs + 4x<24*64>], load3B;
--:-:-:-:0      IADD.X track3B1, track3B1, RZ;

// Synchronize, then move the write addresses to the other buffer and negate
// swapBuf so the next switch goes back.
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:1      IADD writeBs, writeBs, swapBuf;
--:-:-:-:1      IADD writeAs, writeAs, swapBuf;
--:-:-:-:0      IADD swapBuf, RZ, -swapBuf;

// Load the j0 operand set for the first line; barrier 1 is set by the last
// load and waited on by the first FFMA of the loop.
--:-:-:-:1      LDS.U.128 j0Ay0, [readAs + 4x<0*32 + 00>];
--:-:-:-:1      LDS.U.128 j0Bx0, [readBs + 4x<0*64 + 00>];
--:-:-:-:1      LDS.U.128 j0Ay4, [readAs + 4x<0*32 + 16>];
--:-:1:-:1      LDS.U.128 j0Bx4, [readBs + 4x<0*64 + 32>];

// Issue the global loads for the next 32-line block before entering the loop.
// The predicates were set by the R2P at the end of the setup block:
// P2 guards the A loads; P3 guards the B loads in the vector build, P3-P6
// guard the four B elements individually in the non-vector build.
// Barriers 2-6 are set per group (A, B0-B3) for the conversions inside the loop.
[+
    our $vec;
    return $vec ? q{
--:-:2:-:1  @P2 LDG.E.CI.128 load0A, [track0A];
--:-:3:-:1  @P3 LDG.E.CI.64  load0B, [track0B];
--:-:4:-:1  @P3 LDG.E.CI.64  load1B, [track1B];
--:-:5:-:1  @P3 LDG.E.CI.64  load2B, [track2B];
--:-:6:-:1  @P3 LDG.E.CI.64  load3B, [track3B];
    } : q{
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];
--:-:-:-:1  @P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];
--:-:2:-:1  @P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];

--:-:-:-:1  @P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];
--:-:3:-:1  @P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];

--:-:-:-:1  @P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];
--:-:4:-:1  @P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];

--:-:-:-:1  @P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];
--:-:5:-:1  @P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];

--:-:-:-:1  @P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];
--:-:-:-:1  @P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];
--:-:-:-:1  @P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];
--:-:6:-:1  @P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];
    };
+]

// Main loop. The generator below unrolls 8 lines (j = 0..7) of 64 FFMAs each
// and interleaves the supporting instructions listed in %insert: the key
// "j<J>c<C>" places its text right after FFMA number C of line J.
// Per pass the inserted code
//   - decides whether a fetched block is waiting to be stored (P0) and whether
//     another full 32-line block can be fetched (P1, which restores or clears
//     P2-P6 from preds),
//   - converts the previously fetched fp16 values to fp32 and stores them to
//     the write buffer,
//   - advances the global tracks and issues the next global loads,
//   - synchronizes, swaps the shared buffers and branches back while P0 holds.
// Control codes are wait:read-barrier:write-barrier:yield:stall. Stores set a
// read barrier so the following LDG cannot overwrite their source registers
// early; conversions and loads set a write barrier for their consumers.
LOOP:

[+
    our $vec;
    our %insert =
    (
        # P0 = a fetched block is pending; P1 = another full block remains.
        j0c8   => "--:-:-:-:1      ISETP.GE.AND P0, PT, k, RZ, PT;\n",
        j0c10  => "--:-:-:-:1      ISETP.GE.AND P1, PT, k, 32, PT;\n" .
                  "--:-:-:-:1      IADD k, k, -32;\n",

        # Restore the bounds predicates P2-P6 when P1 is set, clear them otherwise.
        j0c23  => "--:-:-:-:1  \@P1 R2P PR, preds, 0x7c;\n",
        j0c24  => "--:-:-:-:1 \@!P1 R2P PR, RZ,    0x7c;\n",

        # Advance the global tracks by 32 lines (A: 64 bytes, B: ldb32 bytes).
        j2c32  => "--:-:-:-:1  \@P2 IADD   track0A0.CC, track0A0, 2x<32>;\n",
        j2c37  => "--:-:-:-:1  \@P2 IADD.X track0A1,    track0A1, RZ;\n",
        j3c32  => "--:-:-:-:1  \@P3 IADD   track0B0.CC, track0B0, ldb32;\n",
        j3c37  => "--:-:-:-:1  \@P3 IADD.X track0B1,    track0B1, RZ;\n",
        j4c32  => "--:-:-:-:1  \@P3 IADD   track1B0.CC, track1B0, ldb32;\n",
        j4c37  => "--:-:-:-:1  \@P3 IADD.X track1B1,    track1B1, RZ;\n",
        j5c32  => "--:-:-:-:1  \@P3 IADD   track2B0.CC, track2B0, ldb32;\n",
        j5c37  => "--:-:-:-:1  \@P3 IADD.X track2B1,    track2B1, RZ;\n",
        j6c32  => "--:-:-:-:1  \@P3 IADD   track3B0.CC, track3B0, ldb32;\n",
        j6c37  => "--:-:-:-:1  \@P3 IADD.X track3B1,    track3B1, RZ;\n",

        # Synchronize and swap the read and write buffers.
        j6c63  => "--:-:-:-:5      BAR.SYNC 0;\n" .
                  "--:-:-:-:1  \@P0 IADD readAs,  readAs, -swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD readBs,  readBs, -swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD writeAs, writeAs, swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD writeBs, writeBs, swapBuf;\n" .
                  "--:-:-:-:1  \@P0 IADD swapBuf, RZ,     -swapBuf;\n",

        # Store converted A values; the first store waits for the conversions
        # (barrier 2) and the last one sets read barrier 2 for the next A load.
        j2c16  => "02:-:-:-:1  \@P0 STS [writeAs + 4x<7*32>], load0A7;\n",
        j2c18  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<6*32>], load0A6;\n",
        j2c20  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<5*32>], load0A5;\n",
        j2c22  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<4*32>], load0A4;\n",
        j2c24  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<3*32>], load0A3;\n",
        j2c26  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<2*32>], load0A2;\n",
        j2c28  => "--:-:-:-:1  \@P0 STS [writeAs + 4x<1*32>], load0A1;\n",
        j2c30  => "--:2:-:-:1  \@P0 STS [writeAs + 4x<0*32>], load0A0;\n",

        # Store converted B values, one track per line, each with its own barrier.
        j3c16  => "04:3:-:-:1  \@P0 STS.128 [writeBs + 4x< 0*64>], load0B;\n",
        j4c16  => "08:4:-:-:1  \@P0 STS.128 [writeBs + 4x< 8*64>], load1B;\n",
        j5c16  => "10:5:-:-:1  \@P0 STS.128 [writeBs + 4x<16*64>], load2B;\n",
        j6c16  => "20:6:-:-:1  \@P0 STS.128 [writeBs + 4x<24*64>], load3B;\n",

        ($vec ?
            (
                # Vector build: unpack two halves per register, highest destination first.
                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A7, load0A3.H1;\n",
                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A3.H0;\n",
                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A2.H1;\n",
                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A2.H0;\n",
                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A1.H1;\n",
                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A1.H0;\n",
                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A0.H1;\n",
                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A0, load0A0.H0;\n",

                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B3, load0B1.H1;\n",
                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B1.H0;\n",
                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B0.H1;\n",
                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B0, load0B0.H0;\n",

                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B3, load1B1.H1;\n",
                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B1.H0;\n",
                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B0.H1;\n",
                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B0, load1B0.H0;\n",

                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B3, load2B1.H1;\n",
                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B1.H0;\n",
                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B0.H1;\n",
                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B0, load2B0.H0;\n",

                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B3, load3B1.H1;\n",
                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B1.H0;\n",
                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B0.H1;\n",
                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B0, load3B0.H0;\n",

                # Next global fetch; each load waits on the read barrier of the
                # store that last used its destination registers.
                j2c61 => "02:-:2:-:1  \@P2 LDG.E.CI.128 load0A, [track0A];\n",
                j3c61 => "04:-:3:-:1  \@P3 LDG.E.CI.64  load0B, [track0B];\n",
                j4c61 => "08:-:4:-:1  \@P3 LDG.E.CI.64  load1B, [track1B];\n",
                j5c61 => "10:-:5:-:1  \@P3 LDG.E.CI.64  load2B, [track2B];\n",
                j6c61 => "20:-:6:-:1  \@P3 LDG.E.CI.64  load3B, [track3B];\n",
            ) :
            (
                # Non-vector build: one half per register, converted in place.
                j1c35 => "02:-:-:-:1  \@P0 F2F.F32.F16 load0A0, load0A0;\n",
                j1c39 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A1, load0A1;\n",
                j1c43 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A2, load0A2;\n",
                j1c47 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A3, load0A3;\n",
                j1c51 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A4, load0A4;\n",
                j1c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A5, load0A5;\n",
                j1c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0A6, load0A6;\n",
                # Write barrier 2 (not a read barrier): the STS at j2c16 waits on
                # barrier 2 for the converted values, as in the vector branch.
                j1c63 => "--:-:2:-:1  \@P0 F2F.F32.F16 load0A7, load0A7;\n",

                j2c51 => "04:-:-:-:1  \@P0 F2F.F32.F16 load0B0, load0B0;\n",
                j2c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B1, load0B1;\n",
                j2c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load0B2, load0B2;\n",
                j2c63 => "--:-:3:-:1  \@P0 F2F.F32.F16 load0B3, load0B3;\n",

                j3c51 => "08:-:-:-:1  \@P0 F2F.F32.F16 load1B0, load1B0;\n",
                j3c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B1, load1B1;\n",
                j3c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load1B2, load1B2;\n",
                j3c63 => "--:-:4:-:1  \@P0 F2F.F32.F16 load1B3, load1B3;\n",

                j4c51 => "10:-:-:-:1  \@P0 F2F.F32.F16 load2B0, load2B0;\n",
                j4c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B1, load2B1;\n",
                j4c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load2B2, load2B2;\n",
                j4c63 => "--:-:5:-:1  \@P0 F2F.F32.F16 load2B3, load2B3;\n",

                j5c51 => "20:-:-:-:1  \@P0 F2F.F32.F16 load3B0, load3B0;\n",
                j5c55 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B1, load3B1;\n",
                j5c59 => "--:-:-:-:1  \@P0 F2F.F32.F16 load3B2, load3B2;\n",
                j5c63 => "--:-:6:-:1  \@P0 F2F.F32.F16 load3B3, load3B3;\n",

                # Next global fetch, one element per load.
                j2c48 => "02:-:-:-:1  \@P2 LDG.E.CI.U16 load0A0, [track0A + 2x<0>];\n",
                j2c50 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A1, [track0A + 2x<1>];\n",
                j2c52 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A2, [track0A + 2x<2>];\n",
                j2c54 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A3, [track0A + 2x<3>];\n",
                j2c56 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A4, [track0A + 2x<4>];\n",
                j2c58 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A5, [track0A + 2x<5>];\n",
                j2c60 => "--:-:-:-:1  \@P2 LDG.E.CI.U16 load0A6, [track0A + 2x<6>];\n",
                j2c62 => "--:-:2:-:1  \@P2 LDG.E.CI.U16 load0A7, [track0A + 2x<7>];\n",

                j3c56 => "04:-:-:-:1  \@P3 LDG.E.CI.U16 load0B0, [track0B + 2x<0>];\n",
                j3c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load0B1, [track0B + 2x<1>];\n",
                j3c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load0B2, [track0B + 2x<2>];\n",
                j3c62 => "--:-:3:-:1  \@P6 LDG.E.CI.U16 load0B3, [track0B + 2x<3>];\n",

                j4c56 => "08:-:-:-:1  \@P3 LDG.E.CI.U16 load1B0, [track1B + 2x<0>];\n",
                j4c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load1B1, [track1B + 2x<1>];\n",
                j4c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load1B2, [track1B + 2x<2>];\n",
                j4c62 => "--:-:4:-:1  \@P6 LDG.E.CI.U16 load1B3, [track1B + 2x<3>];\n",

                j5c56 => "10:-:-:-:1  \@P3 LDG.E.CI.U16 load2B0, [track2B + 2x<0>];\n",
                j5c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load2B1, [track2B + 2x<1>];\n",
                j5c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load2B2, [track2B + 2x<2>];\n",
                j5c62 => "--:-:5:-:1  \@P6 LDG.E.CI.U16 load2B3, [track2B + 2x<3>];\n",

                j6c56 => "20:-:-:-:1  \@P3 LDG.E.CI.U16 load3B0, [track3B + 2x<0>];\n",
                j6c58 => "--:-:-:-:1  \@P4 LDG.E.CI.U16 load3B1, [track3B + 2x<1>];\n",
                j6c60 => "--:-:-:-:1  \@P5 LDG.E.CI.U16 load3B2, [track3B + 2x<2>];\n",
                j6c62 => "--:-:6:-:1  \@P6 LDG.E.CI.U16 load3B3, [track3B + 2x<3>];\n",
            )
        ),
        j7c63 => "--:-:-:Y:5  \@P0 BRA.U LOOP;\n",
    );
    # Order in which the 64 accumulators of one line are updated: 2x2 groups
    # visited through @swirl, with the y order reversed for every other x pair.
    my @cOrder;
    my @swirl = ([0,2],[1,2],[1,0],[0,0]);
    my @y = (0,1,4,5);
    foreach my $x (0,2,4,6)
    {
        foreach my $y (@y)
        {
            push @cOrder, [$x + $_->[0], $y + $_->[1]] foreach @swirl;
        }
        @y = reverse @y;
    }
    my $out = '';
    foreach my $j (0 .. 7)
    {
        # Line j computes from operand set j&1 while the other set is loaded
        # for line j+1; on the last line those loads are predicated on P0.
        my $odd      = $j & 1;
        my $nOdd     = !$odd + 0;
        my $rsOffset = ($j + 1) % 8;
        my $rsPred   = $j == 7 ? '@P0' : '   ';

        $insert{"j${j}c0"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy0, [readAs + 4x<%d*32 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c2"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dBx0, [readBs + 4x<%d*64 + 00>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c4"} = sprintf "--:-:-:-:1  %s LDS.U.128 j%dAy4, [readAs + 4x<%d*32 + 16>];\n", $rsPred, $nOdd, $rsOffset;
        $insert{"j${j}c6"} = sprintf "--:-:1:-:1  %s LDS.U.128 j%dBx4, [readBs + 4x<%d*64 + 32>];\n", $rsPred, $nOdd, $rsOffset;

        foreach my $c (0 .. 63)
        {
            my ($x,$y) = @{$cOrder[$c]};

            my $ins    = $insert{"j${j}c$c"} || '';

            # An FFMA followed by a memory, conversion, barrier or branch
            # instruction is issued with stall 0, every other FFMA with stall 1.
            my $stall  = $ins =~ /LDS|I2I|I2F|F2I|F2F|LDG|STS|BAR|BRA/ ? 0 : 1;

            my $yield  = $c == 32 && $stall ? 'Y' : '-';

            # The first FFMA of each line waits for the operand loads (barrier 1).
            my $wait   = $c == 0 ? '01' : '--';

            my $ctrl   = "$wait:-:-:$yield:$stall";

            $out .= sprintf "%s      FFMA cx%dy%d, j%dBx%d, j%dAy%d, cx%dy%d;\n%s", $ctrl,  $x,$y,  $odd,$x,  $odd,$y,  $x,$y,  $ins;
        }
    }
    return $out;
+]

<SCHEDULE_BLOCK>
// Output-stage setup: load the scaling parameters and flags, then compute this
// thread's shared-memory read address, its C coordinates and the bounds predicates.
--:-:-:-:1      MOV alpha, param_alpha;
--:-:-:-:1      MOV beta,  param_beta;
--:-:-:-:1      MOV flags, param_flags;

// readCs = ((tid & 15) * 4 + (tid / 16) * 64) * 4
--:-:-:-:1      LOP.AND tid15, tid,   15;
--:-:-:-:1      SHR.U32 tid16, tid,    4;
--:-:-:-:1      SHL     tid15, tid15,  2;
--:-:-:-:1      ISCADD readCs, tid16,  tid15, 6;
--:-:-:-:1      SHL    readCs, readCs, 2;

// cx = blkB*64 + tid15 (tid15 already holds (tid & 15) * 4);
// cx1-cx3 are the next three columns.
--:-:-:-:1      ISCADD cx, blkB, tid15, 6;
--:-:-:-:1      IADD   cx1, cx, 1;
--:-:-:-:1      IADD   cx2, cx, 2;
--:-:-:-:1      IADD   cx3, cx, 3;

// cy = blkA*32 + tid16
--:-:-:-:1      ISCADD cy, blkA, tid16, 5;

// C += (cy*ldc + cx) * 2;
// (plus the batch offset ldcz * blkZ). ldc8 = ldc * 16; its use is outside
// this section — presumably the byte stride of 8 rows of 2-byte elements,
// TODO confirm.
--:-:-:-:1      MOV  ldc,  param_ldc;
--:-:-:-:1      MOV  ldcz, param_ldcz;
--:-:-:-:1      SHL  ldc8, ldc, 4;

--:-:-:-:1      XMAD.LO  ci, cy, ldc, cx, xmad_c;
--:-:-:-:1      XMAD.LO2 ci, ldcz, blkZ, ci;
--:-:-:-:1      LEA      C0.CC, ci, param_C[0],     1;
--:-:-:-:1      LEA.HI.X C1,    ci, param_C[1], RZ, 1;

// P0-P3 = cx, cx1, cx2, cx3 < n; saved in preds (mask 0x0f)
--:-:-:-:1      ISETP.LT.AND P0, PT, cx,  param_n, PT;
--:-:-:-:1      ISETP.LT.AND P1, PT, cx1, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P2, PT, cx2, param_n, PT;
--:-:-:-:1      ISETP.LT.AND P3, PT, cx3, param_n, PT;
--:-:-:-:1      P2R preds, PR, RZ, 0x0f;

// P4 = cy < m
--:-:-:-:1      ISETP.LT.AND P4, PT, cy, param_m, PT;

// P5 = beta != 0 && P4
--:-:-:-:1      ISETP.NE.AND P5, PT, beta, RZ, P4;

// P6 = Apply relu (bit 1 of flags is set)
--:-:-:-:1      LOP.AND.NZ P6, RZ, flags, 2;

// Init beta preds: P0-P3 keep the column bounds when P5 is set, else are cleared
--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;

</SCHEDULE_BLOCK>

// First half of the epilogue (accumulator rows y0..y3).
// Every accumulator is scaled by alpha into a shuffle register, then written to
// shared memory with STS.128 at writeCs. There are two stores per row: columns
// x0.. at word offset 0 and columns x4.. at word offset 32 of the row.
// NOTE(review): each STS.128 presumably covers four consecutive shuffle registers
// (x0-x3 / x4-x7); the shuffle register mapping is outside this chunk - confirm.
// NOTE(review): the stall count of 0 on the FMUL ahead of each STS appears to be
// intended to pair the two instructions; verify against the MaxAs control-code notes.
--:-:-:-:1      FMUL shuffle_x0y0, cx0y0, alpha;
--:-:-:-:1      FMUL shuffle_x1y0, cx1y0, alpha;
--:-:-:-:1      FMUL shuffle_x2y0, cx2y0, alpha;
--:-:-:-:1      FMUL shuffle_x3y0, cx3y0, alpha;
--:-:-:-:1      FMUL shuffle_x4y0, cx4y0, alpha;
--:-:-:-:1      FMUL shuffle_x5y0, cx5y0, alpha;
--:-:-:-:1      FMUL shuffle_x6y0, cx6y0, alpha;
--:-:-:-:0      FMUL shuffle_x7y0, cx7y0, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y0;
--:-:-:-:1      FMUL shuffle_x0y1, cx0y1, alpha;
--:-:-:-:1      FMUL shuffle_x1y1, cx1y1, alpha;
--:-:-:-:1      FMUL shuffle_x2y1, cx2y1, alpha;
--:-:-:-:0      FMUL shuffle_x3y1, cx3y1, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y0;
--:-:-:-:1      FMUL shuffle_x4y1, cx4y1, alpha;
--:-:-:-:1      FMUL shuffle_x5y1, cx5y1, alpha;
--:-:-:-:1      FMUL shuffle_x6y1, cx6y1, alpha;
--:-:-:-:0      FMUL shuffle_x7y1, cx7y1, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y1;
--:-:-:-:1      FMUL shuffle_x0y2, cx0y2, alpha;
--:-:-:-:1      FMUL shuffle_x1y2, cx1y2, alpha;
--:-:-:-:1      FMUL shuffle_x2y2, cx2y2, alpha;
--:-:-:-:0      FMUL shuffle_x3y2, cx3y2, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y1;
--:-:-:-:1      FMUL shuffle_x4y2, cx4y2, alpha;
--:-:-:-:1      FMUL shuffle_x5y2, cx5y2, alpha;
--:-:-:-:1      FMUL shuffle_x6y2, cx6y2, alpha;
--:-:-:-:0      FMUL shuffle_x7y2, cx7y2, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y2;
--:-:-:-:1      FMUL shuffle_x0y3, cx0y3, alpha;
--:-:-:-:1      FMUL shuffle_x1y3, cx1y3, alpha;
--:-:-:-:1      FMUL shuffle_x2y3, cx2y3, alpha;
--:-:-:-:0      FMUL shuffle_x3y3, cx3y3, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y2;
--:-:-:-:1      FMUL shuffle_x4y3, cx4y3, alpha;
--:-:-:-:1      FMUL shuffle_x5y3, cx5y3, alpha;
--:-:-:-:1      FMUL shuffle_x6y3, cx6y3, alpha;
--:-:-:-:0      FMUL shuffle_x7y3, cx7y3, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y3;
--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y3;
// All stores of this half are issued before the barrier; STORE_C reads them back
// with LDS after it.
--:-:-:-:5      BAR.SYNC 0;

// Two STORE_C calls per half: each call writes one output row, then advances
// cy by 8, C by ldc8 and flips readCs, so the second call handles row cy + 8.
--:-:-:-:5      CAL STORE_C;
--:-:-:-:5      CAL STORE_C;

// Second half of the epilogue (accumulator rows y4..y7). Same pattern as the
// first half: scale by alpha, stage through shared memory at writeCs, barrier,
// then two STORE_C calls, followed by EXIT.
--:-:-:-:1      FMUL shuffle_x0y4, cx0y4, alpha;
--:-:-:-:1      FMUL shuffle_x1y4, cx1y4, alpha;
--:-:-:-:1      FMUL shuffle_x2y4, cx2y4, alpha;
--:-:-:-:1      FMUL shuffle_x3y4, cx3y4, alpha;
--:-:-:-:1      FMUL shuffle_x4y4, cx4y4, alpha;
--:-:-:-:1      FMUL shuffle_x5y4, cx5y4, alpha;
--:-:-:-:0      FMUL shuffle_x6y4, cx6y4, alpha;
// This barrier sits ahead of the first STS of this half. Presumably it keeps any
// thread from overwriting the shared buffer while others are still doing the LDS
// reads of the preceding STORE_C calls - confirm. Only register-to-register FMULs
// are issued before it.
--:-:-:-:5      BAR.SYNC 0;
--:-:-:-:0      FMUL shuffle_x7y4, cx7y4, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 00>], shuffle_x0y4;
--:-:-:-:1      FMUL shuffle_x0y5, cx0y5, alpha;
--:-:-:-:1      FMUL shuffle_x1y5, cx1y5, alpha;
--:-:-:-:1      FMUL shuffle_x2y5, cx2y5, alpha;
--:-:-:-:0      FMUL shuffle_x3y5, cx3y5, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<0*64 + 32>], shuffle_x4y4;
--:-:-:-:1      FMUL shuffle_x4y5, cx4y5, alpha;
--:-:-:-:1      FMUL shuffle_x5y5, cx5y5, alpha;
--:-:-:-:1      FMUL shuffle_x6y5, cx6y5, alpha;
--:-:-:-:0      FMUL shuffle_x7y5, cx7y5, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 00>], shuffle_x0y5;
--:-:-:-:1      FMUL shuffle_x0y6, cx0y6, alpha;
--:-:-:-:1      FMUL shuffle_x1y6, cx1y6, alpha;
--:-:-:-:1      FMUL shuffle_x2y6, cx2y6, alpha;
--:-:-:-:0      FMUL shuffle_x3y6, cx3y6, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<1*64 + 32>], shuffle_x4y5;
--:-:-:-:1      FMUL shuffle_x4y6, cx4y6, alpha;
--:-:-:-:1      FMUL shuffle_x5y6, cx5y6, alpha;
--:-:-:-:1      FMUL shuffle_x6y6, cx6y6, alpha;
--:-:-:-:0      FMUL shuffle_x7y6, cx7y6, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 00>], shuffle_x0y6;
--:-:-:-:1      FMUL shuffle_x0y7, cx0y7, alpha;
--:-:-:-:1      FMUL shuffle_x1y7, cx1y7, alpha;
--:-:-:-:1      FMUL shuffle_x2y7, cx2y7, alpha;
--:-:-:-:0      FMUL shuffle_x3y7, cx3y7, alpha;
--:-:-:-:1      STS.128 [writeCs+4x<2*64 + 32>], shuffle_x4y6;
--:-:-:-:1      FMUL shuffle_x4y7, cx4y7, alpha;
--:-:-:-:1      FMUL shuffle_x5y7, cx5y7, alpha;
--:-:-:-:1      FMUL shuffle_x6y7, cx6y7, alpha;
--:-:-:-:0      FMUL shuffle_x7y7, cx7y7, alpha;
--:-:-:-:4      STS.128 [writeCs+4x<3*64 + 00>], shuffle_x0y7;
--:-:-:-:1      STS.128 [writeCs+4x<3*64 + 32>], shuffle_x4y7;
// Make the staged values visible to every thread before STORE_C reads them.
--:-:-:-:5      BAR.SYNC 0;

// Two more output rows for this thread (cy and cy + 8 as left by the earlier calls).
--:-:-:-:5      CAL STORE_C;
--:-:-:-:5      CAL STORE_C;

// All output rows of this thread have been handled; terminate.
--:-:-:-:5      EXIT;

// STORE_C: subroutine entered with CAL and left with RET. One call handles one
// output row (four columns) for this thread:
//   1. optionally load the existing C values (on entry P0-P3 hold the beta preds),
//   2. read four partial results from shared memory and add them together,
//   3. apply c += beta * C under P5 and the optional relu under P6,
//   4. convert to fp16 and store under the output preds,
//   5. advance cy by 8, C by ldc8, flip readCs and restore the beta preds
//      so the next call can start at step 1.
// Expects on entry: C0/C1, cy, readCs, preds, beta, ldc8 and P4-P6 as prepared
// by the output setup code.
STORE_C:

// Load of the existing C values. The Perl variable $vec (defined outside this
// chunk) selects a single 64-bit load guarded by P0, or four 16-bit loads each
// guarded by its own predicate with the register zeroed when the predicate is
// false. The final load sets dependency barrier 1.
[+
    our $vec;
    return $vec ? q{
--:-:1:-:1  @P0 LDG.E.64 loadC, [C];
    } : q{
--:-:-:-:0 @!P0 MOV loadC0, RZ;
--:-:-:-:1  @P0 LDG.E.CI.U16 loadC0, [C + 2x<0>];
--:-:-:-:0 @!P1 MOV loadC1, RZ;
--:-:-:-:1  @P1 LDG.E.CI.U16 loadC1, [C + 2x<1>];
--:-:-:-:0 @!P2 MOV loadC2, RZ;
--:-:-:-:1  @P2 LDG.E.CI.U16 loadC2, [C + 2x<2>];
--:-:-:-:0 @!P3 MOV loadC3, RZ;
--:-:1:-:1  @P3 LDG.E.CI.U16 loadC3, [C + 2x<3>];
    };
+]

// Restore output preds: P0-P3 become the column tests gated by P4 (row in range)
// instead of by P5, so the sums and stores below also run when beta is zero.
--:-:-:-:1  @P4 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P4 R2P PR, RZ,    0x0f;

// Fetch the four partial results for this row. They sit 16 row-strides
// (of 64 four-byte words) apart, starting at readCs. The second load sets
// barrier 2 and the fourth sets barrier 3.
--:-:-:-:1      LDS.U.128 part0C, [readCs + 4x< 0*64>];
--:-:2:-:1      LDS.U.128 part1C, [readCs + 4x<16*64>];
--:-:-:-:1      LDS.U.128 part2C, [readCs + 4x<32*64>];
--:-:3:-:1      LDS.U.128 part3C, [readCs + 4x<48*64>];

// Pairwise reduction: (part0 + part1) + (part2 + part3) into c0..c3.
// The leading 02 / 04 wait masks hold the first add of each pair until the
// matching LDS barrier (2 / 3) has cleared.
<SCHEDULE_BLOCK>
02:-:-:-:1  @P0 FADD part0C0, part0C0, part1C0;
--:-:-:-:1  @P1 FADD part0C1, part0C1, part1C1;
--:-:-:-:1  @P2 FADD part0C2, part0C2, part1C2;
--:-:-:-:1  @P3 FADD part0C3, part0C3, part1C3;

04:-:-:-:1  @P0 FADD part2C0, part2C0, part3C0;
--:-:-:-:1  @P1 FADD part2C1, part2C1, part3C1;
--:-:-:-:1  @P2 FADD part2C2, part2C2, part3C2;
--:-:-:-:1  @P3 FADD part2C3, part2C3, part3C3;

--:-:-:-:1  @P0 FADD c0, part0C0, part2C0;
--:-:-:-:1  @P1 FADD c1, part0C1, part2C1;
--:-:-:-:1  @P2 FADD c2, part0C2, part2C2;
--:-:-:-:1  @P3 FADD c3, part0C3, part2C3;
</SCHEDULE_BLOCK>

// Advance the row index now; the P4/P5 updates further down test this new cy
// on behalf of the next call.
--:-:-:-:0      IADD cy, cy, 8;

// Widen the loaded C values from fp16 to fp32 (only when P5). The first
// conversion waits on barrier 1 from the load above; each conversion then sets
// its own barrier (1-4) for the FFMA that consumes it. The $vec variant reads
// the two halves of loadC0 / loadC1, the other variant reads loadC0..loadC3.
[+
    our $vec;
    return $vec ? q{
01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0.H0;
--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC0.H1;
--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC1.H0;
--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC1.H1;
    } : q{
01:-:1:-:1  @P5 F2F.F32.F16 b0, loadC0;
--:-:2:-:1  @P5 F2F.F32.F16 b1, loadC1;
--:-:3:-:1  @P5 F2F.F32.F16 b2, loadC2;
--:-:4:-:1  @P5 F2F.F32.F16 b3, loadC3;
    };
+]

// c += beta * b, each FFMA waiting on the barrier of its conversion.
01:-:-:-:1  @P5 FFMA c0, b0, beta, c0;
02:-:-:-:1  @P5 FFMA c1, b1, beta, c1;
04:-:-:-:1  @P5 FFMA c2, b2, beta, c2;
08:-:-:-:3  @P5 FFMA c3, b3, beta, c3;

// Optional relu (P6): FMNMX against RZ with !PT - presumably selects the
// maximum, i.e. clamps negative values to zero; confirm FMNMX predicate sense.
--:-:-:-:1  @P6 FMNMX c0, c0, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c1, c1, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c2, c2, RZ, !PT;
--:-:-:-:1  @P6 FMNMX c3, c3, RZ, !PT;

// Bookkeeping for the next call is interleaved with the fp32 to fp16
// conversions (stall count 0 on each bookkeeping instruction).
// P5 = P5 && (cy < m) using the already advanced cy.
--:-:-:-:0      ISETP.LT.AND P5, PT, cy, param_m, P5;

--:-:1:-:1  @P0 F2F.F16.F32 c0, c0;
--:-:2:-:1  @P1 F2F.F16.F32 c1, c1;

// P4 = cy < m for the next row.
--:-:-:-:0      ISETP.LT.AND P4, PT, cy, param_m, PT;

--:-:3:-:1  @P2 F2F.F16.F32 c2, c2;

// Flip the 8-row offset bit of readCs so the next call reads the other row.
--:-:-:-:0      LOP.XOR readCs, readCs, 4x<8*64>;

--:-:4:-:1  @P3 F2F.F16.F32 c3, c3;

// Store the converted values. The $vec variant packs pairs of fp16 values into
// 32-bit registers with BFI (0x1010: presumably a 16-bit field at bit 16 -
// confirm) and issues one 64-bit store under P0; the other variant issues four
// 16-bit stores under P0-P3. The last store sets barrier 1 in the second
// control-code slot, which the address update below waits on.
[+
    our $vec;
    return $vec ? q{
03:-:-:-:2  @P0 BFI c0, c1, 0x1010, c0;
0c:-:-:-:2  @P0 BFI c1, c3, 0x1010, c2;

--:1:-:-:1  @P0 STG.E.CG.64 [C], c;
    } : q{
01:-:-:-:1  @P0 STG.E.U16 [C + 2x<0>], c0;
02:-:-:-:1  @P1 STG.E.U16 [C + 2x<1>], c1;
04:-:-:-:1  @P2 STG.E.U16 [C + 2x<2>], c2;
08:1:-:-:1  @P3 STG.E.U16 [C + 2x<3>], c3;
    };
+]

// Restore beta preds: P0-P3 are again gated by the (updated) P5, which is the
// state the load at the top of the next call expects.
--:-:-:-:1  @P5 R2P PR, preds, 0x0f;
--:-:-:-:1 @!P5 R2P PR, RZ,    0x0f;

// C += ldc8 as a 64-bit add (carry from C0 into C1). The 01 wait mask keeps the
// address registers unchanged until the store above has released barrier 1.
01:-:-:-:6      IADD   C0.CC, C0, ldc8;
--:-:-:-:0      IADD.X C1,    C1, RZ;

--:-:-:-:5      RET;
